{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from collections import deque\n",
    "from bitarray import bitarray"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## algorithm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def ihash(x):\n",
    "    h = 86813\n",
    "    while True:\n",
    "        for i in x:\n",
    "            h = ((h + i) * 127733) % (1 << 32)\n",
    "        yield h"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def bloom_filter(array_bytes, k):\n",
    "    array = bitarray(array_bytes * 8)\n",
    "    array.setall(0)\n",
    "\n",
    "    def _hash(x):\n",
    "        for _, h in zip(range(k), ihash(x)):\n",
    "            yield h % len(array)\n",
    "    \n",
    "    def _add(x):\n",
    "        for h in _hash(x):\n",
    "            array[h] = 1\n",
    "\n",
    "    def _contains(x):\n",
    "        return all(array[h] for h in _hash(x))\n",
    "\n",
    "    return _add, _contains"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def measure_accuracy(A, B, array_bytes, k):\n",
    "    add, contains = bloom_filter(array_bytes, k)\n",
    "    \n",
    "    # store A\n",
    "    deque((add(x) for x in A), 0)\n",
    "\n",
    "    # find false positives in B\n",
    "    fp = sum(contains(x) for x in B)\n",
    "\n",
    "    # result\n",
    "    acc = 1 - fp / len(B)\n",
    "    print('{} hashes, {} false positives, {:.4f} accuracy'.format(k, fp, acc))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## run"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(999891, 999659)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "n = 10 ** 6\n",
    "A = set(map(tuple, np.random.randint(0, 256, (n, 4))))\n",
    "B = set(map(tuple, np.random.randint(0, 256, (n, 4)))) - A\n",
    "len(A), len(B)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 hashes, 117551 false positives, 0.8824 accuracy\n",
      "2 hashes, 66855 false positives, 0.9331 accuracy\n",
      "3 hashes, 40262 false positives, 0.9597 accuracy\n",
      "4 hashes, 61965 false positives, 0.9380 accuracy\n"
     ]
    }
   ],
   "source": [
    "for k in [1, 2, 3, 4]:\n",
    "    measure_accuracy(A, B, n, k)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 hashes, 30693 false positives, 0.9693 accuracy\n",
      "2 hashes, 5479 false positives, 0.9945 accuracy\n",
      "4 hashes, 937 false positives, 0.9991 accuracy\n",
      "6 hashes, 326 false positives, 0.9997 accuracy\n",
      "8 hashes, 454 false positives, 0.9995 accuracy\n"
     ]
    }
   ],
   "source": [
    "for k in [1, 2, 4, 6, 8]:\n",
    "    measure_accuracy(A, B, n * 4, k)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
